# Title: Measuring Interethnic Marriage in Africa
# Author: Branden Bohrnsen, bohrnsen@umich.edu
# Date: 1/30/2026
# Description: Calculate observed intermarriage rates and simple ELF adjustments.

library(tidyverse)
library(here)

weighted_se <- function(x, w, na.rm = TRUE) {
    if (na.rm) {
        complete_cases <- !is.na(x) & !is.na(w)
        x <- x[complete_cases]
        w <- w[complete_cases]
    }
    n <- length(x)
    if (n <= 1) return(NA)
    
    weighted_mean <- sum(w * x) / sum(w)
    weighted_var <- sum(w * (x - weighted_mean)^2) / sum(w)
    effective_n <- (sum(w))^2 / sum(w^2)

    sqrt(weighted_var / effective_n)
}

marriages <- read_csv(here("data", "clean", "sample.csv")) %>%
  mutate(Urban = Urbanization >= 0.5) %>%
  mutate(across(where(is.factor), as.character)) %>%
  mutate(Intergroup = Ethnicity_Head != Ethnicity_Spouse) %>%
  mutate(Interblock = EthnicBlock_Head != EthnicBlock_Spouse) %>%
  filter(FullReside == T) %>%
  filter(Polygamous_Head == "No more than one wife linked via SPLOC")

hhi_by_level <- read_csv(here("out", "inter", "hhi_by_level.csv"))
hhi_by_decade <- read_csv(here("out", "inter", "hhi_by_decade.csv"))

national_rates_group <- marriages %>%
  filter(!is.na(Ethnicity_Head)) %>%
  summarise(
    observed_rate = mean(Intergroup, na.rm = TRUE),
    observed_rate_se = sqrt(observed_rate * (1 - observed_rate) / n()),
    n_marriages = n(),
    .groups = "drop"
  ) %>%
  mutate(
    Level = "National",
    Group_Type = "Ethnic Group"
  )

national_rates_block <- marriages %>%
  filter(!is.na(EthnicBlock_Head)) %>%
  summarise(
    observed_rate = mean(Interblock, na.rm = TRUE),
    observed_rate_se = sqrt(observed_rate * (1 - observed_rate) / n()),
    n_marriages = n(),
    .groups = "drop"
  ) %>%
  mutate(
    Level = "National",
    Group_Type = "Ethnic Block"
  )

urban_rates_group <- marriages %>%
  filter(!is.na(Ethnicity_Head) & Urban == TRUE) %>%
  summarise(
    observed_rate = mean(Intergroup, na.rm = TRUE),
    observed_rate_se = sqrt(observed_rate * (1 - observed_rate) / n()),
    n_marriages = n(),
    .groups = "drop"
  ) %>%
  mutate(
    Level = "Urban",
    Group_Type = "Ethnic Group"
  )

urban_rates_block <- marriages %>%
  filter(!is.na(EthnicBlock_Head) & Urban == TRUE) %>%
  summarise(
    observed_rate = mean(Interblock, na.rm = TRUE),
    observed_rate_se = sqrt(observed_rate * (1 - observed_rate) / n()),
    n_marriages = n(),
    .groups = "drop"
  ) %>%
  mutate(
    Level = "Urban",
    Group_Type = "Ethnic Block"
  )

rural_rates_group <- marriages %>%
  filter(!is.na(Ethnicity_Head) & Urban == FALSE) %>%
  summarise(
    observed_rate = mean(Intergroup, na.rm = TRUE),
    observed_rate_se = sqrt(observed_rate * (1 - observed_rate) / n()),
    n_marriages = n(),
    .groups = "drop"
  ) %>%
  mutate(
    Level = "Rural",
    Group_Type = "Ethnic Group"
  )

rural_rates_block <- marriages %>%
  filter(!is.na(EthnicBlock_Head) & Urban == FALSE) %>%
  summarise(
    observed_rate = mean(Interblock, na.rm = TRUE),
    observed_rate_se = sqrt(observed_rate * (1 - observed_rate) / n()),
    n_marriages = n(),
    .groups = "drop"
  ) %>%
  mutate(
    Level = "Rural",
    Group_Type = "Ethnic Block"
  )

all_marriage_rates <- rbind(
  national_rates_group, national_rates_block,
  urban_rates_group, urban_rates_block,
  rural_rates_group, rural_rates_block
)

hhi_for_join <- hhi_by_level %>%
  pivot_longer(
    cols = c(Ethnic_Group_HHI, Ethnic_Block_HHI),
    names_to = "HHI_Type",
    values_to = "HHI"
  ) %>%
  mutate(
    Group_Type = case_when(
      HHI_Type == "Ethnic_Group_HHI" ~ "Ethnic Group",
      HHI_Type == "Ethnic_Block_HHI" ~ "Ethnic Block"
    )
  ) %>%
  select(Level, Group_Type, HHI, N_Group, N_Block)

basic_results <- all_marriage_rates %>%
  left_join(hhi_for_join, by = c("Level", "Group_Type")) %>%
  mutate(
    ELF = 1 - HHI,
    I_basic = observed_rate / ELF,
    I_basic_se = observed_rate_se / ELF,
    census_n = case_when(
      Group_Type == "Ethnic Group" ~ N_Group,
      Group_Type == "Ethnic Block" ~ N_Block
    )
  ) %>%
  select(Level, Group_Type, observed_rate, observed_rate_se, ELF, I_basic, I_basic_se, n_marriages, census_n)

decade_rates_group <- marriages %>%
  filter(!is.na(Ethnicity_Head) & !is.na(DecadeMarried)) %>%
  group_by(DecadeMarried) %>%
  summarise(
    observed_rate = mean(Intergroup, na.rm = TRUE),
    observed_rate_se = sqrt(observed_rate * (1 - observed_rate) / n()),
    n_marriages = n(),
    .groups = "drop"
  ) %>%
  mutate(Group_Type = "Ethnic Group")

decade_rates_block <- marriages %>%
  filter(!is.na(EthnicBlock_Head) & !is.na(DecadeMarried)) %>%
  group_by(DecadeMarried) %>%
  summarise(
    observed_rate = mean(Interblock, na.rm = TRUE),
    observed_rate_se = sqrt(observed_rate * (1 - observed_rate) / n()),
    n_marriages = n(),
    .groups = "drop"
  ) %>%
  mutate(Group_Type = "Ethnic Block")

all_decade_rates <- rbind(decade_rates_group, decade_rates_block)

hhi_decade_for_join <- hhi_by_decade %>%
  pivot_longer(
    cols = c(Ethnic_Group_HHI, Ethnic_Block_HHI),
    names_to = "HHI_Type", 
    values_to = "HHI"
  ) %>%
  mutate(
    Group_Type = case_when(
      HHI_Type == "Ethnic_Group_HHI" ~ "Ethnic Group",
      HHI_Type == "Ethnic_Block_HHI" ~ "Ethnic Block"
    )
  ) %>%
  select(DecadeMarried = Decade, Group_Type, HHI, N_Group, N_Block)

basic_results_decade <- all_decade_rates %>%
  left_join(hhi_decade_for_join, by = c("DecadeMarried", "Group_Type")) %>%
  mutate(
    ELF = 1 - HHI,
    I_basic = observed_rate / ELF,
    I_basic_se = observed_rate_se / ELF,
    census_n = case_when(
      Group_Type == "Ethnic Group" ~ N_Group,
      Group_Type == "Ethnic Block" ~ N_Block
    )
  ) %>%
  select(DecadeMarried, Group_Type, observed_rate, observed_rate_se, ELF, I_basic, I_basic_se, n_marriages, census_n)

decade_urban_rates_group <- marriages %>%
  filter(!is.na(Ethnicity_Head) & !is.na(DecadeMarried) & !is.na(Urban)) %>%
  group_by(DecadeMarried, Urban) %>%
  summarise(
    observed_rate = mean(Intergroup, na.rm = TRUE),
    observed_rate_se = sqrt(observed_rate * (1 - observed_rate) / n()),
    n_marriages = n(),
    .groups = "drop"
  ) %>%
  mutate(Group_Type = "Ethnic Group")

decade_urban_rates_block <- marriages %>%
  filter(!is.na(EthnicBlock_Head) & !is.na(DecadeMarried) & !is.na(Urban)) %>%
  group_by(DecadeMarried, Urban) %>%
  summarise(
    observed_rate = mean(Interblock, na.rm = TRUE),
    observed_rate_se = sqrt(observed_rate * (1 - observed_rate) / n()),
    n_marriages = n(),
    .groups = "drop"
  ) %>%
  mutate(Group_Type = "Ethnic Block")

all_decade_urban_rates <- rbind(decade_urban_rates_group, decade_urban_rates_block)

basic_results_decade_urban <- all_decade_urban_rates %>%
  left_join(hhi_decade_for_join, by = c("DecadeMarried", "Group_Type")) %>%
  mutate(
    ELF = 1 - HHI,
    I_basic = observed_rate / ELF,
    I_basic_se = observed_rate_se / ELF,
    census_n = case_when(
      Group_Type == "Ethnic Group" ~ N_Group,
      Group_Type == "Ethnic Block" ~ N_Block
    ),
    Geography = case_when(
      Urban == TRUE ~ "Urban",
      Urban == FALSE ~ "Rural"
    )
  ) %>%
  select(DecadeMarried, Geography, Group_Type, observed_rate, observed_rate_se, ELF, I_basic, I_basic_se, n_marriages, census_n)

summary_comparison <- basic_results %>%
  filter(Level == "National") %>%
  select(Group_Type, Basic_Observed = observed_rate, Basic_Observed_SE = observed_rate_se, Basic_ELF = ELF, Basic_Adjusted = I_basic, Basic_Adjusted_SE = I_basic_se) %>%
  mutate(Method = "Basic National HHI")

census <- read_csv(here("data", "clean", "census.csv"))
marriages_2000s <- marriages %>%
  filter(DecadeMarried == "2001-2010")

hhi_by_district_2000s_block <- census %>%
  filter(InMarket_00s == 1, !is.na(District), !is.na(EthnicBlock)) %>%
  count(District, EthnicBlock, name = "n_group") %>%
  group_by(District) %>%
  mutate(n_district = sum(n_group)) %>%
  ungroup() %>%
  mutate(prop_group = n_group / n_district) %>%
  group_by(District) %>%
  summarise(
    HHI = sum(prop_group^2),
    .groups = "drop"
  )

district_rates_2000s_block <- marriages_2000s %>%
  filter(!is.na(District)) %>%
  group_by(District) %>%
  summarise(
    observed_rate = mean(Interblock, na.rm = TRUE),
    n_marriages = n(),
    .groups = "drop"
  )

basic_by_district_block <- district_rates_2000s_block %>%
  left_join(hhi_by_district_2000s_block, by = "District") %>%
  mutate(
    ELF = 1 - HHI,
    I_basic = observed_rate / ELF
  ) %>%
  select(District, observed_rate, n_marriages, HHI, ELF, I_basic)

if (!dir.exists(here("out", "final"))) {
  dir.create(here("out", "final"), recursive = TRUE)
}

write_csv(basic_results, here("out", "final", "basic_by_geography.csv"))
write_csv(basic_results_decade, here("out", "final", "basic_by_decade.csv"))
write_csv(basic_results_decade_urban, here("out", "final", "basic_by_decade_urban_rural.csv"))
write_csv(basic_by_district_block, here("out", "final", "basic_by_district_block.csv"))

write_csv(summary_comparison, here("out", "final", "basic_summary.csv"))
